Reading and Filtering Data

Let us start with loading packages:

library("Seurat")
## Attaching SeuratObject
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("data.table")
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library("matrixStats")
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
library("ggplot2")

Now we will load Skeletal Muscle scRNAseq un-normalized data and have a look:

expr <- suppressWarnings(as.data.frame(fread("SkeletalMuscle_HumanChimp_10X.txt",sep="\t")))
rownames(expr)<-expr$V1; expr$V1<-NULL;
expr <- expr[rowSums(expr) != 0,]
#expr <- expr[rowMeans(as.matrix(expr)) >= 0.1,]
expr[1:5,1:5]
##               C00001_chimp C00004_human C00005_human C00006_chimp C00007_chimp
## RP11.34P13.7             0            0            0            0            0
## FO538757.2               0            0            0            0            0
## AP006222.2               0            0            0            0            0
## RP4.669L17.10            0            0            0            0            0
## RP5.857K21.4             0            0            0            0            0
print(paste0("DATASET CONTAINS ",dim(expr)[1]," GENES AND ",dim(expr)[2]," CELLS, AMONG THEM THERE ARE ",sum(grepl("human",colnames(expr)))," HUMAN CELLS AND ", sum(grepl("chimp",colnames(expr))), " CHIMP CELLS"))
## [1] "DATASET CONTAINS 18104 GENES AND 6578 CELLS, AMONG THEM THERE ARE 2935 HUMAN CELLS AND 3643 CHIMP CELLS"

First we are going to look at which genes are most expressed in our data set:

head(sort(rowSums(expr), decreasing=TRUE), 20)
##     MALAT1      NEAT1        TTN        NEB      ACTA1      EMC10      TNNC2 
##    1017013      53361      45022      18865      18366      16743      14244 
##     MYBPC1       TRDN      TNNT1      TNNT3 MIR133A1HG       TCAP       MYH7 
##      11994      10839      10835      10462       7948       6867       6432 
##      ABCA5       TPM2     PDLIM3         MB       NEXN      TNNC1 
##       6329       6142       5901       5753       5634       5526
barplot(log10(as.numeric(sort(rowSums(expr), decreasing=TRUE)) + 1), xlim=c(0,20), names = names(sort(rowSums(expr), decreasing=TRUE)),ylab="LOG10(EXPR)",las=2)

MALAT1 is a nuclear lincRNA which is very abundant. Typically in scRNAseq experiments, high expression of MALAT1 indicates that the cellular membrane is broken and cytoplasmic mRNA starts leaking out. Some studies show that MALAT1 expression is correlated with the mitochondrial contamination. However, in our case of snRNAseq, high expression of MALAT1 is to be expected. Still common practice is to remove this gene from the data set because of unknown reasons for such a high expression. Additional reason is that high expression of MALAT1 can be hard for normalization algorithms. So here we are going to remove MALAT1 genes from the further downstream analysis:

expr<-expr[-which(rownames(expr)=="MALAT1"),]

What about ribosomal gene families expression (RPS and RPL genes)? Their expression is also considered to be a contamination. Let us first display the ribosomal genes:

rownames(expr)[grepl(paste0(c("^RPS","^RPL"),collapse="|"),rownames(expr))]
##  [1] "RPL22"     "RPL11"     "RPS6KA1"   "RPS8"      "RPL5"      "RPS27"    
##  [7] "RPS6KC1"   "RPS7"      "RPS27A"    "RPL31"     "RPL37A"    "RPL32"    
## [13] "RPL15"     "RPSA"      "RPL14"     "RPL29"     "RPL24"     "RPL22L1"  
## [19] "RPL39L"    "RPL35A"    "RPL9"      "RPL34.AS1" "RPL34"     "RPS3A"    
## [25] "RPL37"     "RPS23"     "RPS14"     "RPL26L1"   "RPS18"     "RPS10"    
## [31] "RPL10A"    "RPL7L1"    "RPS12"     "RPS6KA2"   "RPS6KA3"   "RPS4X"    
## [37] "RPS6KA6"   "RPL36A"    "RPL39"     "RPL10"     "RPS20"     "RPL7"     
## [43] "RPL30"     "RPL8"      "RPS6"      "RPL35"     "RPL12"     "RPL7A"    
## [49] "RPLP2"     "RPL27A"    "RPS13"     "RPS6KA4"   "RPS6KB2"   "RPS3"     
## [55] "RPS25"     "RPS24"     "RPS26"     "RPL41"     "RPL6"      "RPLP0"    
## [61] "RPL21"     "RPS29"     "RPL36AL"   "RPS6KL1"   "RPS6KA5"   "RPS27L"   
## [67] "RPL4"      "RPLP1"     "RPS17"     "RPL3L"     "RPS2"      "RPS15A"   
## [73] "RPL13"     "RPL26"     "RPL23A"    "RPL23"     "RPL19"     "RPL27"    
## [79] "RPS6KB1"   "RPL38"     "RPL17"     "RPS21"     "RPS15"     "RPL36"    
## [85] "RPS28"     "RPL18A"    "RPS16"     "RPS19"     "RPL18"     "RPL13A"   
## [91] "RPS11"     "RPS9"      "RPL28"     "RPS5"      "RPS4Y1"    "RPS4Y2"   
## [97] "RPL3"      "RPS19BP1"

Let us now rank all cells by the percentage of ribosomal genes expressed in them:

ribosom_genes_expr_matrix <- expr[rownames(expr)[grepl(paste0(c("^RPS","^RPL"),collapse="|"),rownames(expr))],]
percent_ribosom_expr <- colSums(ribosom_genes_expr_matrix) / colSums(expr)
head(sort(percent_ribosom_expr, decreasing=TRUE), 20)
## C00558_human C06743_human C06698_human C05295_human C09079_human C08692_human 
##    0.2023121    0.1824818    0.1698113    0.1634241    0.1579618    0.1562500 
## C09111_human C06378_human C05025_human C07328_human C01087_human C06977_human 
##    0.1546961    0.1538462    0.1525626    0.1504425    0.1500000    0.1481481 
## C06938_human C01750_human C02122_human C01186_chimp C05326_chimp C09223_human 
##    0.1455696    0.1453397    0.1438596    0.1433447    0.1428571    0.1402878 
## C07894_human C01150_human 
##    0.1388889    0.1382488
barplot(sort(percent_ribosom_expr, decreasing=TRUE), xlim=c(0,1000),las=2)

It looks like we have quite a few nuceli with high (up to 19%) of their UMI due to ribosomal gene families. What can it mean? Well, ribosomal proteins are house keeping genes that may not be clearly related to cell type. The ribosomal genes are often removed to make clustering of cell types more transparrent as they have more or less similar expression across cells and thus do not contribute to cell type identification. In this analysis we will remove the ribosomal protein genes:

expr<-expr[grepl(paste0(c("^RPS","^RPL"),collapse="|"),rownames(expr))==FALSE,]

Now let us display mitochondrial genes:

rownames(expr)[grepl("^MT\\.",rownames(expr))]
##  [1] "MT.ND1"  "MT.ND2"  "MT.CO1"  "MT.CO2"  "MT.ATP8" "MT.ATP6" "MT.CO3" 
##  [8] "MT.ND3"  "MT.ND4L" "MT.ND4"  "MT.ND5"  "MT.ND6"  "MT.CYB"

Presence of those genes usually imply contamination, i.e. when cell membrane is broken, the nuclear RNA floats away but the mitochondrial RNA is still in tact. In our case, if we see nuclei with mitochondrial genes expressed, this would mean that mitochondrial got stuck to the nuclei somehow during the nuclei library prep. Anyhow, nuclei with mitochondrial genes expressed should be filtered out. Let us rank all cell by the percentage of mitochondrial genes expressed:

mito_genes_expr_matrix <- expr[rownames(expr)[grepl("^MT\\.",rownames(expr))],]
percent_mito_expr <- colSums(mito_genes_expr_matrix) / colSums(expr)
head(sort(percent_mito_expr, decreasing=TRUE), 20)
## C05741_human C01562_human C05007_human C05938_human C07894_human C08074_chimp 
##    0.2860360    0.2067039    0.1985112    0.1927711    0.1720430    0.1638225 
## C08927_chimp C07328_human C04358_human C07646_human C08821_chimp C08692_human 
##    0.1577947    0.1562500    0.1530612    0.1530055    0.1495327    0.1481481 
## C09347_human C09682_human C07050_human C06743_human C00533_chimp C00251_chimp 
##    0.1476510    0.1465517    0.1448276    0.1428571    0.1340782    0.1319149 
## C08516_human C06620_human 
##    0.1318182    0.1317829
barplot(sort(percent_mito_expr, decreasing=TRUE), xlim=c(0,1000),las=2)

It seems that quite a few cells have high (up to 25%) percentage of their library sizes due to mitochndrial contamination. Let us display the names of the cells with >5 of their library size contributed by mitochondrial gene expression.

names(percent_mito_expr[percent_mito_expr>0.05])
##   [1] "C00077_human" "C00251_chimp" "C00371_human" "C00403_human" "C00493_chimp"
##   [6] "C00499_human" "C00502_human" "C00533_chimp" "C00677_human" "C00951_human"
##  [11] "C01549_human" "C01562_human" "C01671_chimp" "C01718_human" "C01750_human"
##  [16] "C01900_chimp" "C01929_human" "C02064_human" "C02088_human" "C02173_human"
##  [21] "C02203_human" "C02669_human" "C02704_human" "C02860_human" "C03073_human"
##  [26] "C03148_chimp" "C03175_chimp" "C03306_chimp" "C03317_human" "C03343_chimp"
##  [31] "C03448_human" "C03589_human" "C03745_chimp" "C03847_human" "C03885_chimp"
##  [36] "C04037_human" "C04070_human" "C04107_human" "C04317_chimp" "C04358_human"
##  [41] "C04425_chimp" "C04713_human" "C04772_chimp" "C04816_human" "C04817_human"
##  [46] "C05007_human" "C05012_human" "C05025_human" "C05068_human" "C05082_human"
##  [51] "C05295_human" "C05305_human" "C05492_human" "C05719_human" "C05741_human"
##  [56] "C05757_human" "C05833_chimp" "C05938_human" "C05992_chimp" "C06000_chimp"
##  [61] "C06034_chimp" "C06136_chimp" "C06196_chimp" "C06245_human" "C06406_human"
##  [66] "C06420_human" "C06435_chimp" "C06444_human" "C06578_human" "C06620_human"
##  [71] "C06698_human" "C06720_human" "C06724_human" "C06734_human" "C06743_human"
##  [76] "C06822_human" "C06841_human" "C06938_human" "C06996_human" "C07050_human"
##  [81] "C07127_human" "C07182_chimp" "C07239_chimp" "C07274_chimp" "C07328_human"
##  [86] "C07411_human" "C07418_human" "C07434_human" "C07448_chimp" "C07626_human"
##  [91] "C07636_chimp" "C07646_human" "C07672_human" "C07863_human" "C07894_human"
##  [96] "C07921_human" "C07979_human" "C08026_human" "C08074_chimp" "C08101_human"
## [101] "C08182_chimp" "C08217_human" "C08428_chimp" "C08435_human" "C08454_human"
## [106] "C08516_human" "C08543_human" "C08572_chimp" "C08651_human" "C08692_human"
## [111] "C08821_chimp" "C08915_human" "C08927_chimp" "C09079_human" "C09111_human"
## [116] "C09171_human" "C09223_human" "C09336_human" "C09347_human" "C09361_human"
## [121] "C09431_human" "C09511_human" "C09536_human" "C09598_human" "C09659_chimp"
## [126] "C09682_human" "C09708_chimp" "C09757_human" "C09860_human" "C09867_human"
sum(grepl("human",names(percent_mito_expr[percent_mito_expr>0.05]))==TRUE)/sum(grepl("human",names(percent_mito_expr)))
## [1] 0.03270869
sum(grepl("chimp",names(percent_mito_expr[percent_mito_expr>0.05]))==TRUE)/sum(grepl("chimp",names(percent_mito_expr)))
## [1] 0.009332967

We can see that the majority of contaminated cells come from the human, i.e. approximately 3% of human cells are contaminated while only less than 1% of chimp cells demonstrate high mitochondrial gene expression. Here we are going to filter out the contaminated nuclei with >1% of their UMIs due to mitochondrial genes:

expr[,names(percent_mito_expr[percent_mito_expr>0.01])] <- NULL
print(paste0("DATASET CONTAINS ",dim(expr)[1]," GENES AND ",dim(expr)[2]," CELLS, AMONG THEM THERE ARE ",sum(grepl("human",colnames(expr)))," HUMAN CELLS AND ", sum(grepl("chimp",colnames(expr))), " CHIMP CELLS"))
## [1] "DATASET CONTAINS 18005 GENES AND 5668 CELLS, AMONG THEM THERE ARE 2402 HUMAN CELLS AND 3266 CHIMP CELLS"

Now everything is ready for running the Seurat pipeline. We should not see lots of cells with mitochondrial contamination at the end. Lastly, we will remove the chimp cells because in this notebook we are going to concentrate on human skeletal muscle nuclei.

expr<-subset(expr,select=colnames(expr)[grepl("human",colnames(expr))])
expr <- expr[rowSums(expr) != 0,]
expr[1:5,1:5]
##               C00004_human C00009_human C00011_human C00014_human C00018_human
## RP11.34P13.7             0            0            0            0            0
## FO538757.2               0            0            0            0            0
## AP006222.2               0            0            0            0            0
## RP4.669L17.10            0            0            0            0            0
## RP5.857K21.4             0            0            0            0            0
print(paste0("DATASET CONTAINS ",dim(expr)[1]," GENES AND ",dim(expr)[2]," HUMAN CELLS"))
## [1] "DATASET CONTAINS 15416 GENES AND 2402 HUMAN CELLS"

Performing Quality Control

The steps below encompass the standard pre-processing workflow for scRNA-seq data in Seurat. These represent the creation of a Seurat object, the selection and filtration of cells based on QC metrics, data normalization and scaling, and the detection of highly variable features.

In order to proceed with Seurat pipeine we will have to convert the un-normalized expression matrix into Seurat object. We will keep all features expressed in >= 1 cells (~0.02% of the data) and keep all cells with at least 1 detected features. Those are the minimal requirements, so we are trying to be as permissive as possible here since we have already previously removed genes that are not expressed in any cell.

skm <- CreateSeuratObject(counts = expr, project = "SKM", min.cells = 1, min.features = 1, meta.data=data.frame(row.names=colnames(expr),orig.ident=matrix(unlist(strsplit(colnames(expr),"_")),ncol=2,byrow=TRUE)[,2]))
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
skm@active.ident<-as.factor(skm$orig.ident)
skm
## An object of class Seurat 
## 15416 features across 2402 samples within 1 assay 
## Active assay: RNA (15416 features, 0 variable features)

The number of features and UMIs (nFeature_RNA and nCount_RNA) are automatically calculated for every object by Seurat. For non-UMI data, nCount_RNA represents the sum of the non-normalized values within a cell i.e. library size. nFeature_RNA is a number of non-zero counts per cell, i.e. the number of expressed genes per cell. We calculate the percentage of mitochondrial features here and store it in object metadata as percent.mito (there should be not so many since we have previously removed the mitochondrial genes). We use raw count data since this represents non-transformed and non-log-normalized counts. The % of UMI mapping to MT-features is a common scRNA-seq QC metric.

mito.features <- grep(pattern = "^MT\\.", x = rownames(x = skm), value = TRUE)

# Calculate % of mitochondrial genes per cell
percent.mito <- Matrix::colSums(x = GetAssayData(object = skm, slot = 'counts')[mito.features, ]) / Matrix::colSums(x = GetAssayData(object = skm, slot = 'counts'))

# The [[ operator can add columns to object metadata, and is a great place to stash QC stats
skm[['percent.mito']] <- percent.mito
VlnPlot(object = skm, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"), ncol = 3)

Thus we have in average ~200 genes detected per cell while a typical library size is 300-350. This is not a fantastic coverage but still tolerable for 10X snRNAseq data. This is possibly due to too many unspliced transcripts in the nuclei that can't be counted if one only counts reads aligned to exonic regions (this is how counting is typically done). Good news is that we do not seem to have many detected mitochondrial genes, i.e. they contribute less than 1% of the library sizes of the cells. Now we will check how nFeature_RNA is connected with nCount_RNA:

FeatureScatter(object = skm, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

We observe a strong correlation and conclude that larger library sizes (nCount_RNA) give more detected genes per cell (nFeature_RNA). This agrees well with what is typically know for scRNAseq experiments.

Now we will normalize the data. By default, Seurat employs a global-scaling normalization method “LogNormalize” that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result.

skm <- NormalizeData(object = skm, normalization.method = "LogNormalize", scale.factor = 1e4)

Now we will identify Highly Variable Genes (HVGs) wich will be used for dimensionality reduction. FindVariableFeatures calculates the average expression and dispersion for each feature, places these features into bins, and then calculates a z-score for dispersion within each bin. This helps to control for the relationship between variability and average expression. Since we have a low-coverage data, below, we will keep all genes, i.e. all genes will be considered as highly-variable, otherwise we end up with too few genes.

#skm <- FindVariableFeatures(object = skm, selection.method = 'mean.var.plot', mean.cutoff = c(0.0125, 3), dispersion.cutoff = c(0.5, Inf))
skm <- FindVariableFeatures(object = skm, nfeatures = dim(skm)[1])
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -3.0795
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.30103
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1.5028e-14
length(x = VariableFeatures(object = skm))
## [1] 15416

Thus we have 15416 Highly Variable Genes (HVGs) which we will use for further downstream analysis. So we kept all available genes for getting a better resolution to distingusih between different cell types.

Principal Component Analysis (PCA)

The single cell dataset likely contains "uninteresting" sources of variation. This could include not only technical noise, but batch effects, or even biological sources of variation (cell cycle stage). To mitigate the effect of these signals, Seurat constructs linear models to predict feature expression based on user-defined variables. The scaled z-scored residuals of these models are stored in the scale.data slot, and are used for dimensionality reduction and clustering.

We can regress out cell-cell variation in feature expression driven by batch (if applicable), cell alignment rate (as provided by Drop-seq tools for Drop-seq data), the number of detected molecules, and mitochondrial feature expression. For cycling cells, we can also learn a "cell-cycle" score and regress this out as well. Here we regress on the number of detected molecules per cell.

skm <- ScaleData(object = skm, features = rownames(x = skm), vars.to.regress = c("nCount_RNA","percent.mito"))
## Regressing out nCount_RNA, percent.mito
## Centering and scaling data matrix

Next we will perform linear Dimensionality Reduction which is a Principal Component Analysis (PCA) in this case:

skm <- RunPCA(object = skm, features = VariableFeatures(object = skm), verbose = FALSE)

We can see genes contributing to the loadings of the principal components. Let us now visualize loadings for the first two PCs:

print(x = skm[['pca']], dims = 1:2, nfeatures = 10, projected = FALSE)
## PC_ 1 
## Positive:  EMC10, TTN, NEAT1, MYBPC1, NEB, TRDN, CTD.2545M3.8, MIR133A1HG, CMYA5, MIR1.1HG 
## Negative:  COL6A2, COL6A1, HSPG2, CFD, TNXB, MEG3, SPARCL1, APOD, ABCA8, HMCN2 
## PC_ 2 
## Positive:  TNNT3, ATP2A1, MYH2, MYLPF, TPM1, MYH1, TNNI2, GGT7, MYBPC2, TNNC2 
## Negative:  MYH7B, ATP2A2, TNNT1, TPM3, MYH7, XPO4, TNNI1, TECRL, MYBPC1, CD36
VizDimLoadings(object = skm, dims = 1:2)

It looks like the highly-expressed genes in skeletal muscles contribute to the PC1, while MYH and ATP genes that discriminate between type 1 and type 2 fibers contribute to PC2. So it looks like PC1 separates muscle nuclei from non-muscle nuclei, while PC2 is responsible for the difference between slow and fast twitch skeletal muscle fibers.

Here we will display the PCA plot itself:

DimPlot(object = skm, group.by="orig.ident", pt.size=1.2)

skm <- ProjectDim(object = skm)
## PC_ 1 
## Positive:  EMC10, TTN, NEAT1, MYBPC1, NEB, TRDN, CTD.2545M3.8, MIR133A1HG, CMYA5, MIR1.1HG 
##     NEXN, ACTA1, PDLIM3, ATP2A1, TNNT3, PDLIM5, PDE4DIP, RHOBTB1, NRAP, MYOT 
## Negative:  COL6A2, COL6A1, HSPG2, CFD, TNXB, MEG3, SPARCL1, APOD, ABCA8, HMCN2 
##     DCN, GSN, SCN7A, NAV1, PTGDS, NOVA1, IGFBP7, LTBP4, COL15A1, SPARC 
## PC_ 2 
## Positive:  TNNT3, ATP2A1, MYH2, MYLPF, TPM1, MYH1, TNNI2, GGT7, MYBPC2, TNNC2 
##     ENO3, MYL1, RHOBTB1, FAM166B, PFKM, ACTB, CASQ1, COL4A3, AHCYL1, CTD.2545M3.8 
## Negative:  MYH7B, ATP2A2, TNNT1, TPM3, MYH7, XPO4, TNNI1, TECRL, MYBPC1, CD36 
##     USP54, FEZ2, PFKFB2, TNNC1, IL17D, MYL2, CCDC39, AQP7, ATP1B4, PLIN5 
## PC_ 3 
## Positive:  HLA.E, BTNL9, FABP4, EGFL7, CD74, RNASE1, CAV1, ITGA1, HLA.B, HIF3A 
##     VWF, IFITM3, SOX18, EFNB2, B2M, ADGRF5, TMSB10, STOM, XAF1, TM4SF1 
## Negative:  COL6A1, CFD, TNXB, NOVA1, ABCA8, APOD, COL6A2, SCN7A, PTGDS, DCN 
##     MEG3, LSP1, HMCN2, ITIH5, COL1A2, COL15A1, LTBP4, LRP1, ADAM33, SMOC2 
## PC_ 4 
## Positive:  FBLN2, MEG3, TNNT3, MYH1, ATP2A1, SCN7A, HMCN2, SPTBN1, MYLPF, COL1A2 
##     FN1, COL6A1, HSPG2, DDR2, MFAP5, LRP1, MYH2, CD63, CEP126, ADH1B 
## Negative:  MOCOS, DUSP14, AC004158.1, C15orf38.AP3S2, RP11.30K9.6, GSG1, TSPYL4, STON1, MFAP1, GLIS3 
##     ANKRD30BL, MANF, STEAP2, TTF2, DNAH3, SMURF1, SERAC1, POMZP3, RCAN1, MAP3K13 
## PC_ 5 
## Positive:  BTNL9, PLXDC2, F3, HSPG2, CFD, PRPF3, TM4SF1, SCN7A, RBM15B, MEGF8 
##     SOX18, NDRG1, APOL1, KLF2, RUNX1T1, NEAT1, CYBA, IFI44, GDI2, NUAK1 
## Negative:  PDGFRB, SPARC, RGS5, NOTCH3, NR2F2, NDUFA4L2, IFITM3, IGFBP7, RHOB, COL5A3 
##     TAGLN, SPARCL1, EGFLAM, PMEPA1, HDAC2, SLC4A11, XRCC5, TXN2, PPIA, ACTA2
DimHeatmap(object = skm, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(object = skm, dims = 2, cells = 500, balanced = TRUE)

DimHeatmap(object = skm, dims = 1:9, cells = 500, balanced = TRUE)

We can see from the heatmap for PC1 that there are genes like COL6A2, COL6A1, HSPG2, CFD, TNXB, MEG3, SPARCL1, APOD, ABCA8, HMCN2, DCN, GSN, SCN7A, NAV1, PTGDS, NOVA1, IGFBP7, LTBP4, COL15A1, SPARC that are expressed in a small group of nuclei, while skeletal muscle specific genes like EMC10, TTN, NEAT1, MYBPC1, NEB, TRDN, CTD.2545M3.8, MIR133A1HG, CMYA5, MIR1.1HG, NEXN, ACTA1, PDLIM3, ATP2A1, TNNT3, PDLIM5, PDE4DIP, RHOBTB1, NRAP, MYOT seem to be depleted from this small group of cells. This again confirms that this small group of cells are probably not skeletal muscle nuclei at all.

The next step is to decide how many principal components to keep for further downstream analysis. This step is another filtering step, in this way we eliminate noisy genes and keep only most informative ones. PC selection can be viewed as identifying the true dimensionality of a dataset

Seurat uses a resampling test inspired by the "JackStraw" procedure. It randomly permutes a subset of the data (1% by default) and rerun PCA, constructing a ‘null distribution’ of gene scores, and repeat this procedure. We identify ‘significant’ PCs as those who have a strong enrichment of low p-value genes.

skm <- JackStraw(object = skm, num.replicate = 100)
skm <- ScoreJackStraw(object = skm, dims = 1:10)
JackStrawPlot(object = skm, dims = 1:10)
## Warning: Removed 128937 rows containing missing values (geom_point).

ElbowPlot(object = skm)

Seurat provides a visualization tool for comparing the distribution of p-values for each PC with a uniform distribution (dashed line). ‘Significant’ PCs will show a strong enrichment of genes with low p-values (solid curve above the dashed line). In this case it appears that PCs 1-3 are significant.

Cell Cycle Assignment

Having created the PCA plot we can now use known cell cycle gene markers in order to assign a cell cycle status to each cell in order to see if the cycling cells form a separate cluster. A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat. We can segregate this list into markers of G2/M phase and markers of S phase:

cc.genes
## $s.genes
##  [1] "MCM5"     "PCNA"     "TYMS"     "FEN1"     "MCM2"     "MCM4"    
##  [7] "RRM1"     "UNG"      "GINS2"    "MCM6"     "CDCA7"    "DTL"     
## [13] "PRIM1"    "UHRF1"    "MLF1IP"   "HELLS"    "RFC2"     "RPA2"    
## [19] "NASP"     "RAD51AP1" "GMNN"     "WDR76"    "SLBP"     "CCNE2"   
## [25] "UBR7"     "POLD3"    "MSH2"     "ATAD2"    "RAD51"    "RRM2"    
## [31] "CDC45"    "CDC6"     "EXO1"     "TIPIN"    "DSCC1"    "BLM"     
## [37] "CASP8AP2" "USP1"     "CLSPN"    "POLA1"    "CHAF1B"   "BRIP1"   
## [43] "E2F8"    
## 
## $g2m.genes
##  [1] "HMGB2"   "CDK1"    "NUSAP1"  "UBE2C"   "BIRC5"   "TPX2"    "TOP2A"  
##  [8] "NDC80"   "CKS2"    "NUF2"    "CKS1B"   "MKI67"   "TMPO"    "CENPF"  
## [15] "TACC3"   "FAM64A"  "SMC4"    "CCNB2"   "CKAP2L"  "CKAP2"   "AURKB"  
## [22] "BUB1"    "KIF11"   "ANP32E"  "TUBB4B"  "GTSE1"   "KIF20B"  "HJURP"  
## [29] "CDCA3"   "HN1"     "CDC20"   "TTK"     "CDC25C"  "KIF2C"   "RANGAP1"
## [36] "NCAPD2"  "DLGAP5"  "CDCA2"   "CDCA8"   "ECT2"    "KIF23"   "HMMR"   
## [43] "AURKA"   "PSRC1"   "ANLN"    "LBR"     "CKAP5"   "CENPE"   "CTCF"   
## [50] "NEK2"    "G2E3"    "GAS2L3"  "CBX5"    "CENPA"
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes

First, we assign each cell a score, based on its expression of G2/M and S phase markers. These marker sets should be anticorrelated in their expression levels, and cells expressing neither are likely not cycling and in G1 phase.

skm <- CellCycleScoring(object = skm, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
## Warning: The following features are not present in the object: TYMS, DTL,
## MLF1IP, WDR76, RAD51, RRM2, CDC45, CDC6, EXO1, DSCC1, BRIP1, not searching for
## symbol synonyms
## Warning: The following features are not present in the object: CDK1, UBE2C,
## TOP2A, NDC80, NUF2, MKI67, FAM64A, CCNB2, AURKB, BUB1, KIF11, GTSE1, HJURP,
## CDCA3, CDC20, TTK, CDC25C, KIF2C, DLGAP5, CDCA2, CDCA8, KIF23, PSRC1, ANLN,
## CENPE, NEK2, GAS2L3, CENPA, not searching for symbol synonyms
# view cell cycle scores and phase assignments
head(x = skm[[]])
##              orig.ident nCount_RNA nFeature_RNA percent.mito      S.Score
## C00004_human      human        254          200  0.000000000 -0.014212486
## C00009_human      human        202          157  0.004950495 -0.012058636
## C00011_human      human        214          172  0.004672897  0.120798081
## C00014_human      human        199          142  0.005025126 -0.015129670
## C00018_human      human        214          174  0.000000000 -0.008913617
## C00024_human      human        473          341  0.002114165 -0.033846850
##                 G2M.Score Phase old.ident
## C00004_human -0.021091762    G1     human
## C00009_human -0.011184614    G1     human
## C00011_human -0.014697865     S     human
## C00014_human -0.007484297    G1     human
## C00018_human -0.014697865    G1     human
## C00024_human  0.223544415   G2M     human
table(skm[[]]$Phase)
## 
##   G1  G2M    S 
## 1796  320  286
skm@meta.data$old.ident<-as.factor(skm@active.ident)

Good news is that the vast majority of cells do not seem to be cycling, i.e. they are in the G1 phase. Around 1600 cells though seem to be in G2M / S phase, which is not that bad, but the most important is to check now that those ~1600 cells do not form a separate cluster but spread uniformly through the cell lineage clusters.

skm <- RunPCA(object = skm, features = VariableFeatures(object = skm), verbose = FALSE)
DimPlot(object = skm, pt.size=1.2)

Well, we do not seem to observe a separate cluster for the cycling cells on the PCA plot, the G2M and S cells seem to be spread across the PCA plot. What about tSNE, will we see the cycling cells cluster there?

Dimensionality Reduction and Clustering

Seurat uses graph-based clustering algorithm similar to Shared Nearest Neighbor (SNN). This method embeds cells in a graph structure - for example a K-nearest neighbor (KNN) graph, with edges drawn between cells with similar gene expression patterns, and then attempts to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’. We first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To cluster the cells, we apply modularity optimization techniques such as the Louvain algorithm (default).

skm <- FindNeighbors(object = skm, dims = 1:3)
## Computing nearest neighbor graph
## Computing SNN
skm <- FindClusters(object = skm, resolution = 0.1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2402
## Number of edges: 62945
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9424
## Number of communities: 3
## Elapsed time: 0 seconds

Here we perform dimensionality reduction with tSNE and color the identified clusters by the graph-based clustering algorithm.

opt_perp <- round(sqrt(length(skm@active.ident)),0)
opt_perp
## [1] 49
skm <- RunTSNE(object = skm, dims = 1:5, perplexity = opt_perp, check_duplicates = FALSE)
#skm <- RunUMAP(object = skm, reduction.use = "pca", dims = 1:20, min_dist = 0.75)
DimPlot(object = skm, reduction = 'tsne', pt.size=1.2, label=TRUE, label.size=8)

DimPlot(object = skm, reduction = 'tsne', group.by="orig.ident", pt.size=1.2)

DimPlot(object = skm, reduction = 'tsne', group.by="old.ident", pt.size=1.2) + ggtitle('')

DimPlot(object = skm, reduction = 'pca', group.by="old.ident", pt.size=1.2) + ggtitle('')

DimPlot(object = skm, reduction = 'pca', pt.size=1.2)

It looks like human cells form two very distinct clusters, while the chimp cells are less obviously clustered into two blobs. The forths cluster where the human and chimp cells overlap might be some rare cell population or just poor quality cells, some more thorough analysis is needed here. Good news is that the cycling cells are spread homogeneously across all the clusters and do not form a separate cluster.

Let us check the numbers of cells in each cluster:

table(skm@active.ident)
## 
##    0    1    2 
## 1124 1105  173
cell_assignment<-data.frame(CELL=names(skm@active.ident),CLUSTER=as.numeric(as.character(skm@active.ident)))
print(head(cell_assignment))
##           CELL CLUSTER
## 1 C00004_human       1
## 2 C00009_human       0
## 3 C00011_human       0
## 4 C00014_human       0
## 5 C00018_human       0
## 6 C00024_human       0
write.table(cell_assignment,file="snRNAseq_cell_assignment_human.txt",col.names=TRUE,row.names=FALSE,quote=FALSE,sep="\t")
write.table(expr,file="snRNAseq_Human_SKM_SeuratFiltered.txt",col.names=TRUE,row.names=TRUE,quote=FALSE,sep="\t")

#cell_human_assignment<-cell_assignment[cell_assignment$CLUSTER==0 | cell_assignment$CLUSTER==1,]
#cell_human_assignment<-cell_human_assignment[grepl("human",as.character(cell_human_assignment$CELL))==TRUE,]
#print(head(cell_human_assignment))
#expr_human<-subset(expr,select=colnames(expr)[colnames(expr)%in%as.character(cell_human_assignment$CELL)])
#write.table(cell_human_assignment,file="snRNAseq_cell_human_assignment_type1_type2.txt",col.names=TRUE,row.names=FALSE,quote=FALSE,sep="\t")
#write.table(expr_human,file="snRNAseq_Human_SKM_SeuratFiltered_type1_type2.txt",col.names=TRUE,row.names=TRUE,quote=FALSE,sep="\t")

Let us find markers for each cluster:

# find all markers of cluster 0,1,2
cluster0.markers <- FindMarkers(object = skm, ident.1 = 0, min.pct = 0.25)
head(x = cluster0.markers, n = 10)
##                p_val avg_log2FC pct.1 pct.2     p_val_adj
## TNNT3  5.346834e-231  2.0673886 0.939 0.488 8.242679e-227
## ATP2A1 1.030568e-168  2.4548282 0.725 0.208 1.588724e-164
## TNNT1  1.430321e-138 -1.6199061 0.538 0.866 2.204983e-134
## MYH7B  1.578637e-120 -2.9602556 0.069 0.509 2.433626e-116
## MYH2   7.873324e-102  1.9119459 0.650 0.268  1.213752e-97
## ATP2A2  4.070713e-95 -1.9998985 0.187 0.587  6.275411e-91
## EMC10   4.604660e-84  0.7936567 0.975 0.858  7.098544e-80
## TPM3    1.449670e-75 -1.7921225 0.166 0.520  2.234812e-71
## MYLPF   1.880193e-73  1.4167275 0.664 0.357  2.898506e-69
## MYH7    9.165042e-73 -1.5146932 0.367 0.687  1.412883e-68
cluster1.markers <- FindMarkers(object = skm, ident.1 = 1, min.pct = 0.25)
head(x = cluster1.markers, n = 10)
##                p_val avg_log2FC pct.1 pct.2     p_val_adj
## TNNT3  4.572019e-178  -1.926076 0.491 0.876 7.048225e-174
## TNNT1  1.994821e-168   1.693991 0.915 0.540 3.075216e-164
## MYH7B  1.076554e-157   3.095681 0.576 0.071 1.659615e-153
## MYBPC1 1.345624e-128   1.072622 0.951 0.757 2.074413e-124
## ATP2A1 5.487958e-122  -2.239665 0.217 0.648 8.460235e-118
## ATP2A2 1.475253e-118   2.032616 0.645 0.190 2.274251e-114
## TPM3    2.645367e-91   1.789133 0.568 0.172  4.078097e-87
## MYH7    4.138088e-89   1.501196 0.736 0.368  6.379277e-85
## MYH2    1.705771e-78  -1.794970 0.271 0.598  2.629617e-74
## MYLPF   2.052282e-61  -1.359334 0.355 0.625  3.163798e-57
cluster2.markers <- FindMarkers(object = skm, ident.1 = 2, min.pct = 0.25)
head(x = cluster2.markers, n = 10)
##               p_val avg_log2FC pct.1 pct.2    p_val_adj
## EMC10  7.684449e-85  -3.340618 0.237 0.965 1.184635e-80
## COL6A2 1.099849e-83   3.682526 0.260 0.010 1.695528e-79
## TTN    5.727091e-79  -2.124196 0.590 0.995 8.828883e-75
## MEG3   2.551979e-70   3.876199 0.358 0.036 3.934130e-66
## HSPG2  9.272474e-68   3.556684 0.254 0.015 1.429445e-63
## NEAT1  2.438690e-63  -1.442775 0.821 0.995 3.759485e-59
## MYBPC1 1.962046e-51  -2.127907 0.335 0.886 3.024690e-47
## CFD    3.634532e-50   3.698356 0.301 0.038 5.602995e-46
## NEB    4.998392e-45  -1.702654 0.439 0.904 7.705521e-41
## TRDN   8.528503e-45  -2.186616 0.243 0.828 1.314754e-40

..and display violin plots for a couple of cluster markers:

VlnPlot(object = skm, features = c("ATP2A1", "MYH7B", "EMC10"))

Now we can find markers for every cluster compared to all remaining cells, report only the positive ones:

skm.markers <- FindAllMarkers(object = skm, only.pos = TRUE, min.pct = 0.25, test.use = "roc", logfc.threshold = 0.5)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
skm.markers[order(-skm.markers$avg_log2FC),]
##        myAUC  avg_diff power avg_log2FC pct.1 pct.2 cluster   gene
## MYH7B  0.757 2.1457622 0.514  3.0956805 0.576 0.071       1  MYH7B
## ATP2A1 0.798 1.7015572 0.596  2.4548282 0.725 0.208       0 ATP2A1
## TNNT3  0.878 1.4330046 0.756  2.0673886 0.939 0.488       0  TNNT3
## ATP2A2 0.742 1.4089023 0.484  2.0326163 0.645 0.190       1 ATP2A2
## MYH2   0.731 1.3252599 0.462  1.9119459 0.650 0.268       0   MYH2
## TPM3   0.705 1.2401326 0.410  1.7891331 0.568 0.172       1   TPM3
## TNNT1  0.823 1.1741853 0.646  1.6939913 0.915 0.540       1  TNNT1
## MYH7   0.725 1.0405501 0.450  1.5011965 0.736 0.368       1   MYH7
## MYBPC1 0.785 0.7434852 0.570  1.0726224 0.951 0.757       1 MYBPC1
## EMC10  0.729 0.5501209 0.458  0.7936567 0.975 0.858       0  EMC10
#skm.markers %>% group_by(cluster) %>% top_n(n = 5, wt = avg_logFC)

Looks like the third cluster (with label 2 though) is special as it is depleted for the TTN gene which is highly expressed everywhere but in the third cluster. We can also color the cells on tSNE plot by the marker genes the predominantly express:

#FeaturePlot(object = skm, features = c("ABCA5","TRIM63","EMC10","ATP2A1","MYH7", "MYH7B","ACTN3","MYH1","MYH2"))
#FeaturePlot(object = skm, features = c("TTN","MT.CO1","EMC10","ATP2A1","ATP2A2","MYH7", "MYH7B","MYH1","MYH2"))
FeaturePlot(object = skm, features = c("EMC10","TTN","DCN","ATP2A2","MYH7","MYH7B","ATP2A1","MYH1","MYH2"),min.cutoff=c("q1","q1",NA,NA,NA,NA,NA,NA,NA))

We can also plot the top 20 markers (or all markers if less than 20) for each cluster:

skm.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(object = skm, features = top10$gene) + NoLegend()

The clustering is not very transparent implying that more thorough filtering is needed, alternatively, the cells look biologically very similar. We can assign final labels to clusters based on their gene markers:

#new.cluster.ids <- c("Human Slow Twitch","Human Fast Twitch", "Unclear Cells")
new.cluster.ids <- c("A","B", "C")
names(x = new.cluster.ids) <- levels(x = skm)
skm <- RenameIdents(object = skm, new.cluster.ids)
DimPlot(object = skm, reduction = 'tsne', label = TRUE, pt.size=1, label.size=15) + NoLegend()

DoHeatmap(object = skm, features = top10$gene) + NoLegend()

DimPlot(object = skm, reduction = 'pca', pt.size=1.2)

FeaturePlot(object = skm, features = c("EMC10","TTN","MEG3","ATP2A2","TPM3","MYH7B","ATP2A1","TNNT3","MYH2"),min.cutoff=c("q1","q1","q1",NA,NA,NA,NA,"q1",NA))

#FeaturePlot(object = skm, features = c("EMC10","TTN","DCN","ATP2A2","MYH7","MYH7B","ATP2A1","MYH1","MYH2"),min.cutoff=c("q1","q1","q1",NA,NA,NA,NA,NA,NA))
FeaturePlot(object = skm, features = c("MYH1"))

Save the environment for later usage

Now we will save the working environment for using it later:

save.image(file = "snRNAseq_Human.RData")